### This script contains functions that load raw survey data

# Load the raw survey data for a country whose survey waves are all stored in
# a single dataset, and return it as one data frame with a `year` column.
#
# Arguments:
#   external_path  Root directory containing the per-country raw data folders.
#   country        One of "germany", "switzerland", "united_states", "cses".
#   alt            Germany only. If TRUE, keep only rows with V4a == "Ost"
#                  (East Germany); if FALSE (default), drop those rows.
#
# Returns:
#   A data frame. Every branch provides a `year` column: Germany takes it from
#   `v4` (as character), the United States from `VCF0004`, Switzerland uses
#   the existing `year` column, and CSES overwrites `year` with
#   "<country>_<year>".
#
# Notes:
#   - The CSES branch reads from `build_path`, which is NOT an argument of
#     this function; it must be defined in an enclosing (typically the global)
#     environment before the call.
#   - An unsupported `country` raises an explicit error.
load_raw_data_all_years <- function(external_path, country, alt = FALSE) {
  if (country == "germany") {
    # Load survey data for all years
    db_all <- read.dta13(
      sprintf("%s/%s/all/survey.dta", external_path, country),
      encoding = "UTF-8"
    )

    # Drop duplicated respondents
    db_all <- dplyr::distinct(db_all, v2, v4, V4a, v80, .keep_all = TRUE)

    if (alt) {
      # Select East Germany
      db_all <- dplyr::filter(db_all, V4a == "Ost")
    } else {
      # Drop observations from East Germany
      db_all <- dplyr::filter(db_all, V4a != "Ost")
    }

    # Drop observations with varying versions of questionnaires
    db_all <- dplyr::filter(db_all, v79 == "nicht erhoben")

    # Set the year variable (v4) aside so that it stays in character form
    # while the remaining factor columns are converted to numeric below.
    year_chr <- as.character(db_all$v4)

    # Variables: v3 = month, v8-v14 = affects, v72 = party ID, v78 = weight,
    # v79 = group, v22 = ideology
    db_all <- subset(
      db_all,
      select = c(v3, v8, v9, v10, v11, v12, v13, v14, v22, v72, v78, v79)
    )

    # as.numeric() on a factor yields the integer level codes, not the labels.
    db_all[] <- lapply(db_all, function(x) {
      if (is.factor(x)) as.numeric(x) else x
    })

    # Add the year variable back to the dataset as the last column
    db_all$year <- year_chr

  } else if (country == "switzerland") {
    db_all <- read.spss(
      sprintf("%s/%s/all/survey.sav", external_path, country),
      to.data.frame = TRUE
    )

    # Per
    # `aggregate(db_all$pid2b == "no party identification",
    #            by = list(db_all$year, db_all$pid1), FUN = mean, na.rm = T)`,
    # 2011 and 2015 incorrectly assign party ids to people that responded "no"
    # for whether or not they have a party identification.
    db_all$pid2b[db_all$pid1 == "no"] <- "no party identification"

    db_all <- subset(
      db_all,
      select = c(year, sypa1, sypa2, sypa3, sypa4, sypa5, sypa6, sypa7, sypa8,
                 sypa9, sypa10, sypa11, sypa12, sypa14, sypa15, weighttot,
                 pid2b, pid4b, lr1)
    )

  } else if (country == "united_states") {
    db_all <- read.spss(
      sprintf("%s/%s/survey.sav", external_path, country),
      to.data.frame = TRUE
    )
    db_all <- dplyr::rename(db_all, year = VCF0004)

  } else if (country == "cses") {
    # CSES data is read from `build_path` (a variable defined outside this
    # function), not from `external_path`.
    if (!exists("build_path")) {
      stop("`build_path` must be defined before loading CSES data.",
           call. = FALSE)
    }
    db_all      <- read.csv(sprintf("%s/cses_data.csv", build_path))
    # Make the year identifier unique across countries: "<country>_<year>"
    db_all$year <- sprintf("%s_%s", db_all$country, db_all$year)
    db_all      <- data.frame(db_all)

  } else {
    stop(sprintf("Unsupported country for load_raw_data_all_years(): '%s'",
                 country),
         call. = FALSE)
  }

  db_all
}

# Return the raw survey data for a single survey year.
#
# Arguments:
#   yr             Survey year to load (compared against `db_all$year`, or
#                  used as a folder name for per-year files).
#   format         Vector of file formats ("dta", "csv", anything else is
#                  read as SPSS .sav); only element `i` is used.
#   i              Index into `format` for the requested year.
#   external_path  Root directory containing the raw data folders.
#   country        Country identifier.
#   db_all         Pooled data frame with a `year` column; required for
#                  "germany", "switzerland", "united_states" and "cses",
#                  except for the United States in 2020, which is read from
#                  its own Stata file.
#
# Returns:
#   A data frame with the requested year's data.
load_raw_data_by_year <- function(yr, format, i, external_path, country, db_all = NULL) {
  pooled_countries <- c("germany", "switzerland", "united_states", "cses")

  if (country %in% pooled_countries) {
    # The 2020 US wave is read from a dedicated file instead of the pooled
    # dataset, so there is no need to subset `db_all` first.
    if (country == "united_states" && yr == 2020) {
      return(read.dta13(sprintf("%s/anes/2020/anes_timeseries_2020_stata_20210211.dta", external_path)))
    }

    if (is.null(db_all)) {
      stop(sprintf("`db_all` must be supplied for country '%s'.", country),
           call. = FALSE)
    }
    if (!"year" %in% names(db_all)) {
      stop("`db_all` must contain a `year` column.", call. = FALSE)
    }

    # Keep rows whose year matches; rows with a missing year are dropped.
    year_col <- db_all[["year"]]
    keep     <- !is.na(year_col) & year_col == yr
    return(db_all[keep, , drop = FALSE])
  }

  # Countries with a separate dataset for each year
  fmt <- format[i]
  if (country %in% c("norway", "sweden")) {
    # For these countries a declared "sav" format is read as "dta" instead.
    fmt <- gsub("sav", "dta", fmt)
  }

  if (fmt == "dta") {
    # Norway is read without generating factors, except for 2001.
    use_factors <- !(country == "norway" && yr != 2001)
    db <- read.dta13(sprintf("%s/%s/%s/survey.dta", external_path, country, yr),
                     generate.factors = use_factors)
  } else if (fmt == "csv") {
    db <- read.csv(sprintf("%s/%s/%s/survey.csv", external_path, country, yr))
  } else {
    db <- read.spss(sprintf("%s/%s/%s/survey.sav", external_path, country, yr),
                    to.data.frame = TRUE)
  }

  db
}
